# Dominic Boccaleri — Data Mining 1: Decision Tree / Naive Bayes
# Imports for the decision-tree / Naive Bayes credit-risk analysis.
# NOTE: the original imported StringIO from sklearn.externals.six, which was
# removed in scikit-learn 0.23; io.StringIO is the drop-in replacement.
import os
from io import StringIO

import pandas as pd
import pydotplus
from IPython.display import Image
from sklearn import metrics, tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz

os.getcwd()  # result discarded; kept from the original (just reports the CWD)
# Load the Kaggle "Give Me Some Credit" training data.
data = pd.read_csv("cs-training.csv")
data.head()


def _bin_three(series, low, high):
    """Bin a numeric series into ordinal categories 1/2/3.

    Returns 1 where value <= low, 2 where low < value <= high,
    3 where value > high.  NaN inputs stay NaN (so dropna() can
    remove them later), matching the original .loc-based binning.
    """
    binned = pd.Series(float("nan"), index=series.index)
    binned[series <= high] = 2
    binned[series <= low] = 1
    binned[series > high] = 3
    return binned


# The original repeated the same three .loc assignments per column with
# inconsistent thresholds (e.g. 800000 vs 80000 for MonthlyIncome, 14 vs 4
# for NumberOfDependents); because the later assignments overwrote the
# earlier ones, the EFFECTIVE cut points were (30000, 80000) and (2, 4).
# Those effective cuts are preserved here.
_cuts = {
    "NumberOfTime30-59DaysPastDueNotWorse": ("times_delinquent30-59", 3, 10),
    "NumberOfTime60-89DaysPastDueNotWorse": ("times_delinquent60-89", 3, 10),
    "NumberOfTimes90DaysLate": ("times_delinquent90", 3, 10),
    "NumberOfDependents": ("catNumberOfDependents", 2, 4),
    "NumberRealEstateLoansOrLines": ("catNumberRealEstateLoansOrLines", 3, 10),
    "NumberOfOpenCreditLinesAndLoans": ("catNumberOfOpenCreditLinesAndLoans", 3, 10),
    "MonthlyIncome": ("catMonthlyIncome", 30000, 80000),
    "DebtRatio": ("catDebtRatio", .33, .66),
    "age": ("catage", 30, 50),
    "RevolvingUtilizationOfUnsecuredLines": ("catRevolvingUtilizationOfUnsecuredLines", .3, .6),
}
for _src, (_dst, _low, _high) in _cuts.items():
    data[_dst] = _bin_three(data[_src], _low, _high)
    data[_dst].value_counts()

# Binary flag: 1 if the borrower was seriously delinquent in 2 years, else 0.
data.loc[data["SeriousDlqin2yrs"] > 0, "catSeriousDlqin2yrs"] = 1
data.loc[data["SeriousDlqin2yrs"] < 1, "catSeriousDlqin2yrs"] = 0
data["catSeriousDlqin2yrs"].value_counts()
# Assemble the modelling frame from the binned columns; name the feature
# list once instead of repeating it.  (The original also had a bare
# `df.dropna()` whose result was discarded — only the assignment matters.)
feature_columns = ["times_delinquent30-59", "times_delinquent60-89", "times_delinquent90",
                   "catNumberOfDependents", "catNumberRealEstateLoansOrLines",
                   "catNumberOfOpenCreditLinesAndLoans", "catDebtRatio", "catage",
                   "catSeriousDlqin2yrs", "catRevolvingUtilizationOfUnsecuredLines"]
df = data[["catMonthlyIncome"] + feature_columns]
df.head()
df = df.dropna()  # drop rows with any missing binned value

X = df[feature_columns]        # predictors: every binned column but the target
Y = df[["catMonthlyIncome"]]   # target: binned monthly income (1/2/3)
# Hold out 20% of the rows for testing, then fit a decision tree on the
# training portion and report its held-out accuracy.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=1
)
clf = DecisionTreeClassifier().fit(X_train, Y_train)
Y_pred = clf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(Y_test, Y_pred))
# Render the fitted tree to MonthlyIncome.png via graphviz/pydotplus.
dot_data = StringIO()
feature_columns = ["times_delinquent30-59", "times_delinquent60-89", "times_delinquent90",
                   "catNumberOfDependents", "catNumberRealEstateLoansOrLines", "catNumberOfOpenCreditLinesAndLoans",
                   "catDebtRatio", "catage", "catSeriousDlqin2yrs", "catRevolvingUtilizationOfUnsecuredLines"]
# BUG FIX: the income bins are 1/2/3, not the ["0","1","2"] the original
# passed — derive the labels from the classifier so the rendered legend
# always matches clf.classes_ (export_graphviz assigns names in that order).
export_graphviz(clf, out_file=dot_data, filled=True, rounded=True, special_characters=True,
                feature_names=feature_columns,
                class_names=[str(int(c)) for c in clf.classes_])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png("MonthlyIncome.png")
Image(graph.create_png())
# --- Naive Bayes models --------------------------------------------------
from sklearn.naive_bayes import GaussianNB as gnb
from sklearn.naive_bayes import MultinomialNB as mnb

# sklearn expects a 1-D target; Y_train is a one-column DataFrame, so
# flatten it once instead of triggering a DataConversionWarning per fit.
y_train_flat = Y_train.values.ravel()

# Gaussian NB: assumes each feature is normally distributed within a class.
modelgnb = gnb()
modelgnb.fit(X_train, y_train_flat)
y_predgnb = modelgnb.predict(X_test)
print("Accuracy:", metrics.accuracy_score(Y_test, y_predgnb))

# Multinomial NB: suits discrete, count-like features such as these bins.
modelmnb = mnb()
modelmnb.fit(X_train, y_train_flat)
y_predmnb = modelmnb.predict(X_test)
print("Accuracy:", metrics.accuracy_score(Y_test, y_predmnb))
# For the homework, I found a dataset with credit-style features such as the
# number of times a person was delinquent on their bills, how much a person
# makes per month, and how many inquiries a person has made in the last two
# years. The dataset came from kaggle.com
# (https://www.kaggle.com/c/GiveMeSomeCredit/overview). I did some
# preprocessing in which I binned/categorized the data so as not to have so
# many distinct values. I decided to focus on the monthly income a person
# makes. I built a decision tree based on the features of the dataset, and
# based on the training data (with a 20% split for testing) it predicted the
# monthly income with 99.3% accuracy. After the decision tree, I built a
# Naive Bayes model using the Gaussian Naive Bayes variant. That model proved
# relatively worthless for this data and underperformed with only 42.4%
# accuracy. I then tried a Multinomial Naive Bayes model, which performed
# very well, testing at 99.3% accuracy on the same split. Comparing the
# models, the decision tree and the Multinomial Naive Bayes models performed
# extremely well with the data, though this could possibly be due to
# over-preprocessing or over-categorizing.
# This section produces the output .png image. The image could not be
# generated on my machine due to an issue between Windows and the
# graphviz/pydotplus libraries; the (labeled) code does run when the
# libraries are installed properly on OSX with Anaconda.
# The type of Naive Bayes depends on which distribution the model assumes
# for the features, e.g. a Gaussian distribution or a Multinomial
# distribution. That assumed distribution is used as the likelihood term of
# the Bayesian model. The posterior is P(x|data) = P(data|x) * P(x) / P(data),
# and the chosen distribution models P(data|x), from which we predict the
# response given the data.